{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\n",
    "import pandas as pd \n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(33465, 6993)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_nodup = joblib.load('../../scrum_data/train_data/train_nodup.lz4')\n",
    "data_tag = joblib.load('../../scrum_data/train_data/train_tag.lz4')\n",
    "data_date = joblib.load('../../scrum_data/train_data/train_date.lz4')\n",
    "data_week =joblib.load('../../scrum_data/train_data/train_week.lz4')\n",
    "data_null_sign = joblib.load('../../scrum_data/train_data/train_null_sign90.lz4')\n",
    "data_majcnt = joblib.load('../../scrum_data/train_data/train_majcnt.lz4')\n",
    "data_cf_fs = joblib.load('../../scrum_data/train_data/train_cf_86_fs10.lz4')\n",
    "\n",
    "data = pd.concat([data_nodup,data_tag,data_date,data_week,data_null_sign,data_majcnt,data_cf_fs],axis=1,copy=False)\n",
    "data_label = pd.read_csv('../../preprocess_data/train_y_33465.csv',usecols=['label'])\n",
    "x_df = data.fillna(-1)\n",
    "y = data_label.values.ravel()\n",
    "x_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['y_train.lz4']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "joblib.dump(x_df,'x_trian.lz4',compress='lz4')\n",
    "joblib.dump(y,'y_train.lz4',compress='lz4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
       "       colsample_bytree=1, early_stopping_rounds=100, eval_metric='auc',\n",
       "       gamma=1, learning_rate=0.05, max_delta_step=0, max_depth=5,\n",
       "       min_child_weight=2, missing=None, n_estimators=250, n_jobs=18,\n",
       "       nthread=None, objective='binary:logistic', random_state=0,\n",
       "       reg_alpha=0, reg_lambda=1, scale_pos_weight=14.22520473157416,\n",
       "       seed=2018, silent=False, subsample=0.9)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from xgboost import XGBClassifier\n",
    "xgb = XGBClassifier(n_estimators = 250,\n",
    "                      max_depth = 5,\n",
    "                      min_child_weight = 2,\n",
    "                      learning_rate =0.05, \n",
    "                      subsample = 0.9,\n",
    "                      booster='gbtree',\n",
    "                      objective='binary:logistic',\n",
    "                      early_stopping_rounds=100,\n",
    "                      scale_pos_weight=float(len(y)-np.sum(y))/float(np.sum(y)),\n",
    "                      eval_metric='auc',\n",
    "                      gamma=1,\n",
    "                      reg_lambda=1,\n",
    "                      seed=2018,\n",
    "                      silent=False,\n",
    "                      n_jobs=18\n",
    "                             )\n",
    "xgb.fit(x_df.values,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['feature_names']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "joblib.dump(xgb,'xgb_model')\n",
    "joblib.dump(data.columns,'feature_names')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
